In [ ]:
#Importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [ ]:
from google.colab import files
files = files.upload()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
In [ ]:
# Load the dataset into a dataframe

df = pd.read_csv("NYC_listings.csv")
df
<ipython-input-3-a3d3d522f803>:3: DtypeWarning: Columns (68) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv("NYC_listings.csv")
Out[ ]:
id listing_url scrape_id last_scraped source name description neighborhood_overview picture_url host_id ... review_scores_communication review_scores_location review_scores_value license instant_bookable calculated_host_listings_count calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month
0 2595 https://www.airbnb.com/rooms/2595 20221204162430 2022-12-05 city scrape Skylit Midtown Castle Beautiful, spacious skylit studio in the heart... Centrally located in the heart of Manhattan ju... https://a0.muscache.com/pictures/f0813a11-40b2... 2845 ... 4.80 4.81 4.40 NaN f 3 3 0 0 0.31
1 5203 https://www.airbnb.com/rooms/5203 20221204162430 2022-12-05 previous scrape Cozy Clean Guest Room - Family Apt Our best guests are seeking a safe, clean, spa... Our neighborhood is full of restaurants and ca... https://a0.muscache.com/pictures/103776/b37157... 7490 ... 4.95 4.94 4.92 NaN f 1 0 1 0 0.73
2 5136 https://www.airbnb.com/rooms/5136 20221204162430 2022-12-04 city scrape Spacious Brooklyn Duplex, Patio + Garden We welcome you to stay in our lovely 2 br dupl... NaN https://a0.muscache.com/pictures/miso/Hosting-... 7378 ... 5.00 4.67 5.00 NaN f 1 1 0 0 0.03
3 5121 https://www.airbnb.com/rooms/5121 20221204162430 2022-12-05 city scrape BlissArtsSpace! One room available for rent in a 2 bedroom apt... NaN https://a0.muscache.com/pictures/2090980c-b68e... 7356 ... 4.91 4.47 4.52 NaN f 2 0 2 0 0.30
4 6848 https://www.airbnb.com/rooms/6848 20221204162430 2022-12-05 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN https://a0.muscache.com/pictures/e4f031a7-f146... 15991 ... 4.80 4.67 4.56 NaN f 1 1 0 0 1.13
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41528 772683159414917117 https://www.airbnb.com/rooms/772683159414917117 20221204162430 2022-12-05 city scrape Dahiari Desconecta de tus preocupaciones en este espac... NaN https://a0.muscache.com/pictures/miso/Hosting-... 125534010 ... NaN NaN NaN NaN f 1 1 0 0 NaN
41529 772705452516314073 https://www.airbnb.com/rooms/772705452516314073 20221204162430 2022-12-05 city scrape Beautiful Basement Your family will be close to everything when y... NaN https://a0.muscache.com/pictures/miso/Hosting-... 338424773 ... NaN NaN NaN NaN t 1 0 1 0 NaN
41530 772710779275911753 https://www.airbnb.com/rooms/772710779275911753 20221204162430 2022-12-05 city scrape Central Park Close By - 24 This is a Three-Bedroom Apartment. You will ha... NaN https://a0.muscache.com/pictures/miso/Hosting-... 2653479 ... NaN NaN NaN NaN t 37 2 35 0 NaN
41531 772714221060214808 https://www.airbnb.com/rooms/772714221060214808 20221204162430 2022-12-04 city scrape Good Vibes at The Bronx Keep it simple at this peaceful and centrally-... NaN https://a0.muscache.com/pictures/miso/Hosting-... 421264574 ... NaN NaN NaN NaN t 1 1 0 0 NaN
41532 772716724205003579 https://www.airbnb.com/rooms/772716724205003579 20221204162430 2022-12-05 city scrape 2 bedroom Condo near West Village This beautifully decorated condo will give you... NaN https://a0.muscache.com/pictures/miso/Hosting-... 481177884 ... NaN NaN NaN NaN t 2 2 0 0 NaN

41533 rows × 75 columns

In [ ]:
df.head(30)
Out[ ]:
id listing_url scrape_id last_scraped source name description neighborhood_overview picture_url host_id ... review_scores_communication review_scores_location review_scores_value license instant_bookable calculated_host_listings_count calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month
0 2595 https://www.airbnb.com/rooms/2595 20221204162430 2022-12-05 city scrape Skylit Midtown Castle Beautiful, spacious skylit studio in the heart... Centrally located in the heart of Manhattan ju... https://a0.muscache.com/pictures/f0813a11-40b2... 2845 ... 4.80 4.81 4.40 NaN f 3 3 0 0 0.31
1 5203 https://www.airbnb.com/rooms/5203 20221204162430 2022-12-05 previous scrape Cozy Clean Guest Room - Family Apt Our best guests are seeking a safe, clean, spa... Our neighborhood is full of restaurants and ca... https://a0.muscache.com/pictures/103776/b37157... 7490 ... 4.95 4.94 4.92 NaN f 1 0 1 0 0.73
2 5136 https://www.airbnb.com/rooms/5136 20221204162430 2022-12-04 city scrape Spacious Brooklyn Duplex, Patio + Garden We welcome you to stay in our lovely 2 br dupl... NaN https://a0.muscache.com/pictures/miso/Hosting-... 7378 ... 5.00 4.67 5.00 NaN f 1 1 0 0 0.03
3 5121 https://www.airbnb.com/rooms/5121 20221204162430 2022-12-05 city scrape BlissArtsSpace! One room available for rent in a 2 bedroom apt... NaN https://a0.muscache.com/pictures/2090980c-b68e... 7356 ... 4.91 4.47 4.52 NaN f 2 0 2 0 0.30
4 6848 https://www.airbnb.com/rooms/6848 20221204162430 2022-12-05 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN https://a0.muscache.com/pictures/e4f031a7-f146... 15991 ... 4.80 4.67 4.56 NaN f 1 1 0 0 1.13
5 5178 https://www.airbnb.com/rooms/5178 20221204162430 2022-12-05 city scrape Large Furnished Room Near B'way Please don’t expect the luxury here just a bas... Theater district, many restaurants around here. https://a0.muscache.com/pictures/12065/f070997... 8967 ... 4.45 4.88 4.39 NaN f 1 0 1 0 3.38
6 6990 https://www.airbnb.com/rooms/6990 20221204162430 2022-12-05 city scrape UES Beautiful Blue Room Beautiful peaceful healthy home<br /><br /><b>... Location: Five minutes to Central Park, Museum... https://a0.muscache.com/pictures/be6cd5b3-9295... 16800 ... 4.95 4.84 4.85 NaN t 1 0 1 0 1.52
7 6872 https://www.airbnb.com/rooms/6872 20221204162430 2022-12-05 city scrape Uptown Sanctuary w/ Private Bath (Month to Month) A charming month-to-month home away from home ... This sweet Harlem sanctuary is a 10-20 minute ... https://a0.muscache.com/pictures/miso/Hosting-... 16104 ... 5.00 5.00 5.00 NaN f 2 0 2 0 0.16
8 7097 https://www.airbnb.com/rooms/7097 20221204162430 2022-12-04 city scrape Perfect for Your Parents: Privacy + Garden Parents/grandparents coming to town, or just h... Residential, village-like atmosphere. Lots of ... https://a0.muscache.com/pictures/miso/Hosting-... 17571 ... 4.92 4.94 4.81 NaN t 2 1 1 0 2.01
9 7064 https://www.airbnb.com/rooms/7064 20221204162430 2022-12-05 city scrape Amazing location! Wburg. Large, bright & tranquil Large, private loft-like room in a spacious 2-... - One stop from the East Village, Lower East S... https://a0.muscache.com/pictures/13708959/7e74... 17297 ... 5.00 5.00 5.00 NaN f 2 0 2 0 0.09
10 8490 https://www.airbnb.com/rooms/8490 20221204162430 2022-12-05 city scrape Maison des Sirenes1,bohemian, luminous apartment <b>The space</b><br />I am the lucky owner of ... NaN https://a0.muscache.com/pictures/1c51369e-a251... 25183 ... 4.87 4.65 4.75 NaN f 2 2 0 0 1.01
11 7801 https://www.airbnb.com/rooms/7801 20221204162430 2022-12-05 city scrape Sweet and Spacious Brooklyn Loft A true open-plan loft in a repurposed factory ... We've lived here for over 10 years and watched... https://a0.muscache.com/pictures/207102/56d6fc... 21207 ... 4.60 5.00 4.80 NaN f 1 1 0 0 0.06
12 9357 https://www.airbnb.com/rooms/9357 20221204162430 2022-12-05 previous scrape Midtown Pied-a-terre HELLO. PLEASE DO NOT HIT "REQUEST TO BOOK". H... Quiet residential block near many restaurants ... https://a0.muscache.com/pictures/90036/4e60665... 30193 ... 5.00 4.95 4.58 NaN f 1 1 0 0 0.36
13 5803 https://www.airbnb.com/rooms/5803 20221204162430 2022-12-04 city scrape Lovely Room 1 in BEST AREA; Legal Rental, Spot... Beautiful house, gorgeous garden, large patio,... Neighborhood is amazing!<br />Best subways to ... https://a0.muscache.com/pictures/2884180/f19a1... 9744 ... 4.83 4.87 4.74 NaN f 3 1 2 0 1.31
14 10962 https://www.airbnb.com/rooms/10962 20221204162430 2022-12-04 city scrape Lovely Room 2 in BEST AREA; Legal Rental, Spot... Lovely room, gorgeous garden, helpful host in... Neighborhood is wonderful, a great walking nei... https://a0.muscache.com/pictures/2885219/f762f... 9744 ... 4.78 4.88 4.74 NaN f 3 1 2 0 1.37
15 9704 https://www.airbnb.com/rooms/9704 20221204162430 2022-12-05 city scrape Spacious 1 bedroom in luxe building The room is spacious, the neighborhood is safe... NaN https://a0.muscache.com/pictures/38418/569b54f... 32045 ... 4.84 4.84 4.90 NaN f 1 0 1 0 0.95
16 12192 https://www.airbnb.com/rooms/12192 20221204162430 2022-12-05 city scrape ENJOY Downtown NYC! Please be vaccinated and responsible if you ar... Enjoy great food, music, unique shops, night-l... https://a0.muscache.com/pictures/miso/Hosting-... 46978 ... 4.85 4.69 4.52 NaN f 2 0 2 0 1.82
17 11943 https://www.airbnb.com/rooms/11943 20221204162430 2022-12-05 previous scrape Country space in the city <b>The space</b><br />Ditmas Park. Entire 3rd ... NaN https://a0.muscache.com/pictures/53007/d30884b... 45445 ... NaN NaN NaN NaN f 1 0 1 0 NaN
18 12940 https://www.airbnb.com/rooms/12940 20221204162430 2022-12-05 city scrape Charming Brownstone 3 - Near PRATT Super cute 1 bedroom apartment in a 100 year o... Multicultural melting pot. Lots of cafes, bar... https://a0.muscache.com/pictures/miso/Hosting-... 50148 ... 4.52 4.03 4.36 NaN f 1 1 0 0 0.46
19 12937 https://www.airbnb.com/rooms/12937 20221204162430 2022-12-05 city scrape 1 Stop fr. Manhattan! Private Suite,Landmark B... Private room, dedicated bath and a separate en... Long Island City is the hottest neighborhood i... https://a0.muscache.com/pictures/10f2783b-5e8e... 50124 ... 4.91 4.90 4.86 NaN f 1 0 1 0 2.30
20 10452 https://www.airbnb.com/rooms/10452 20221204162430 2022-12-05 city scrape Large B&B Style rooms Great location.<br /><br /><b>The space</b><br... NaN https://a0.muscache.com/pictures/16336315/c4bf... 35935 ... 4.84 4.39 4.64 NaN f 5 0 5 0 0.53
21 31130 https://www.airbnb.com/rooms/31130 20221204162430 2022-12-05 city scrape Most Central Location! The bedroom is set up completely for you. Seco... Central Park, TimeWarner center on Columbus Ci... https://a0.muscache.com/pictures/32ad29f2-419b... 117287 ... 4.95 5.00 4.83 NaN f 4 2 2 0 0.45
22 13808 https://www.airbnb.com/rooms/13808 20221204162430 2022-12-05 city scrape Blue Room for 2 in Brownstone for $1350 monthly Romantic quiet room in a beautiful 1800 Libert... We are in New York! And Brooklyn is the new hi... https://a0.muscache.com/pictures/81099/72ccf5f... 54275 ... 4.86 4.54 4.71 NaN f 4 0 4 0 1.18
23 14290 https://www.airbnb.com/rooms/14290 20221204162430 2022-12-05 city scrape * ORIGINAL BROOKLYN LOFT * Original factory building loft, lots of natur... Bushwick is a constantly changing area, new o... https://a0.muscache.com/pictures/448859/dbf8f1... 56104 ... 4.54 4.74 4.68 NaN f 1 1 0 0 0.97
24 31555 https://www.airbnb.com/rooms/31555 20221204162430 2022-12-05 previous scrape Luminous Beautiful West Village Studio Wonderfully bright, nicely furnished 400-squar... Tree-lined streets, buzzing bar and restaurant... https://a0.muscache.com/pictures/70290811/e989... 135619 ... 4.97 4.93 4.69 NaN f 1 1 0 0 0.22
25 29683 https://www.airbnb.com/rooms/29683 20221204162430 2022-12-05 city scrape Stylish Apartment with office space Near SoHo! Modern 1 bedroom apartment with stylish Scandi... NoHo is perfectly situated downtown Manhattan.... https://a0.muscache.com/pictures/d670b401-5ce5... 125857 ... 4.92 4.90 4.63 NaN f 2 1 1 0 0.77
26 31902 https://www.airbnb.com/rooms/31902 20221204162430 2022-12-05 previous scrape Sanctuary in East Flatbush Come and Stay in a warm and nurturing environm... I love the sweetness and tranquility. I love ... https://a0.muscache.com/pictures/74729149/1220... 137292 ... 4.33 4.33 4.33 NaN f 1 0 1 0 0.03
27 61509 https://www.airbnb.com/rooms/61509 20221204162430 2022-12-05 previous scrape Quiet, clean midtown apt w. elevato This apartment is available until July 30th. S... It is located steps away from Grand central, m... https://a0.muscache.com/pictures/12284324/9fbf... 23619 ... 4.52 4.84 4.41 NaN f 1 1 0 0 0.64
28 62427 https://www.airbnb.com/rooms/62427 20221204162430 2022-12-05 previous scrape Great East Village Apartment Rental Be in the heart of the best neighborhood in NY... NaN https://a0.muscache.com/pictures/381971/8d6dd5... 303882 ... 4.97 4.98 4.79 NaN f 1 1 0 0 0.45
29 14314 https://www.airbnb.com/rooms/14314 20221204162430 2022-12-05 city scrape Greenpoint Place...Has It All! Cozy, comfortable, one bedroom apartment on gr... NaN https://a0.muscache.com/pictures/67332445/1478... 56246 ... 4.93 4.80 4.78 NaN f 1 1 0 0 1.13

30 rows × 75 columns

In [ ]:
plt.figure(figsize=(20,10))
sns.heatmap(df.isna().transpose(),
            cmap="YlGnBu",
            cbar_kws={'label': 'Missing Data'})
plt.savefig("visualizing_missing_data_with_heatmap_Seaborn_Python.png", dpi=100)
In [ ]:
df.shape
Out[ ]:
(41533, 75)
In [ ]:
# Missing value count for each column

df_missing = df.isna().sum()
df_missing = df_missing.sort_values(ascending = False)
print(df_missing.to_markdown())
|                                              |     0 |
|:---------------------------------------------|------:|
| bathrooms                                    | 41533 |
| calendar_updated                             | 41533 |
| license                                      | 41532 |
| host_about                                   | 18312 |
| neighborhood_overview                        | 17444 |
| neighbourhood                                | 17443 |
| host_response_time                           | 13645 |
| host_response_rate                           | 13645 |
| host_acceptance_rate                         | 12211 |
| review_scores_value                          |  9848 |
| review_scores_location                       |  9848 |
| review_scores_checkin                        |  9845 |
| review_scores_accuracy                       |  9841 |
| review_scores_communication                  |  9836 |
| review_scores_cleanliness                    |  9831 |
| reviews_per_month                            |  9393 |
| first_review                                 |  9393 |
| last_review                                  |  9393 |
| review_scores_rating                         |  9393 |
| host_neighbourhood                           |  8189 |
| host_location                                |  7745 |
| bedrooms                                     |  3822 |
| beds                                         |   941 |
| description                                  |   786 |
| bathrooms_text                               |    77 |
| host_is_superhost                            |    29 |
| maximum_nights_avg_ntm                       |    14 |
| minimum_nights_avg_ntm                       |    14 |
| maximum_maximum_nights                       |    14 |
| minimum_maximum_nights                       |    14 |
| maximum_minimum_nights                       |    14 |
| minimum_minimum_nights                       |    14 |
| name                                         |    13 |
| host_name                                    |     5 |
| host_since                                   |     5 |
| host_total_listings_count                    |     5 |
| host_listings_count                          |     5 |
| host_picture_url                             |     5 |
| host_identity_verified                       |     5 |
| host_has_profile_pic                         |     5 |
| host_thumbnail_url                           |     5 |
| listing_url                                  |     0 |
| host_verifications                           |     0 |
| number_of_reviews_l30d                       |     0 |
| host_url                                     |     0 |
| host_id                                      |     0 |
| picture_url                                  |     0 |
| source                                       |     0 |
| last_scraped                                 |     0 |
| scrape_id                                    |     0 |
| instant_bookable                             |     0 |
| calculated_host_listings_count               |     0 |
| calculated_host_listings_count_entire_homes  |     0 |
| calculated_host_listings_count_private_rooms |     0 |
| calculated_host_listings_count_shared_rooms  |     0 |
| number_of_reviews_ltm                        |     0 |
| number_of_reviews                            |     0 |
| calendar_last_scraped                        |     0 |
| availability_365                             |     0 |
| amenities                                    |     0 |
| price                                        |     0 |
| minimum_nights                               |     0 |
| maximum_nights                               |     0 |
| accommodates                                 |     0 |
| room_type                                    |     0 |
| property_type                                |     0 |
| longitude                                    |     0 |
| latitude                                     |     0 |
| neighbourhood_group_cleansed                 |     0 |
| neighbourhood_cleansed                       |     0 |
| has_availability                             |     0 |
| availability_30                              |     0 |
| availability_60                              |     0 |
| availability_90                              |     0 |
| id                                           |     0 |
In [ ]:
# Missing value percentage count for each column

missing_values = df.isnull().mean()*100
missing_values = missing_values.sort_values(ascending = False)
print(missing_values.to_markdown())
|                                              |           0 |
|:---------------------------------------------|------------:|
| bathrooms                                    | 100         |
| calendar_updated                             | 100         |
| license                                      |  99.9976    |
| host_about                                   |  44.0902    |
| neighborhood_overview                        |  42.0003    |
| neighbourhood                                |  41.9979    |
| host_response_time                           |  32.8534    |
| host_response_rate                           |  32.8534    |
| host_acceptance_rate                         |  29.4007    |
| review_scores_value                          |  23.7113    |
| review_scores_location                       |  23.7113    |
| review_scores_checkin                        |  23.704     |
| review_scores_accuracy                       |  23.6944    |
| review_scores_communication                  |  23.6824    |
| review_scores_cleanliness                    |  23.6703    |
| reviews_per_month                            |  22.6158    |
| first_review                                 |  22.6158    |
| last_review                                  |  22.6158    |
| review_scores_rating                         |  22.6158    |
| host_neighbourhood                           |  19.7169    |
| host_location                                |  18.6478    |
| bedrooms                                     |   9.20232   |
| beds                                         |   2.26567   |
| description                                  |   1.89247   |
| bathrooms_text                               |   0.185395  |
| host_is_superhost                            |   0.069824  |
| maximum_nights_avg_ntm                       |   0.0337081 |
| minimum_nights_avg_ntm                       |   0.0337081 |
| maximum_maximum_nights                       |   0.0337081 |
| minimum_maximum_nights                       |   0.0337081 |
| maximum_minimum_nights                       |   0.0337081 |
| minimum_minimum_nights                       |   0.0337081 |
| name                                         |   0.0313004 |
| host_name                                    |   0.0120386 |
| host_since                                   |   0.0120386 |
| host_total_listings_count                    |   0.0120386 |
| host_listings_count                          |   0.0120386 |
| host_picture_url                             |   0.0120386 |
| host_identity_verified                       |   0.0120386 |
| host_has_profile_pic                         |   0.0120386 |
| host_thumbnail_url                           |   0.0120386 |
| listing_url                                  |   0         |
| host_verifications                           |   0         |
| number_of_reviews_l30d                       |   0         |
| host_url                                     |   0         |
| host_id                                      |   0         |
| picture_url                                  |   0         |
| source                                       |   0         |
| last_scraped                                 |   0         |
| scrape_id                                    |   0         |
| instant_bookable                             |   0         |
| calculated_host_listings_count               |   0         |
| calculated_host_listings_count_entire_homes  |   0         |
| calculated_host_listings_count_private_rooms |   0         |
| calculated_host_listings_count_shared_rooms  |   0         |
| number_of_reviews_ltm                        |   0         |
| number_of_reviews                            |   0         |
| calendar_last_scraped                        |   0         |
| availability_365                             |   0         |
| amenities                                    |   0         |
| price                                        |   0         |
| minimum_nights                               |   0         |
| maximum_nights                               |   0         |
| accommodates                                 |   0         |
| room_type                                    |   0         |
| property_type                                |   0         |
| longitude                                    |   0         |
| latitude                                     |   0         |
| neighbourhood_group_cleansed                 |   0         |
| neighbourhood_cleansed                       |   0         |
| has_availability                             |   0         |
| availability_30                              |   0         |
| availability_60                              |   0         |
| availability_90                              |   0         |
| id                                           |   0         |
In [ ]:
df[['maximum_nights_avg_ntm','minimum_nights_avg_ntm','maximum_maximum_nights','minimum_maximum_nights','maximum_minimum_nights','minimum_minimum_nights','minimum_nights','maximum_nights']].head(10)
Out[ ]:
maximum_nights_avg_ntm minimum_nights_avg_ntm maximum_maximum_nights minimum_maximum_nights maximum_minimum_nights minimum_minimum_nights minimum_nights maximum_nights
0 1125.0 30.0 1125.0 1125.0 30.0 30.0 30 1125
1 14.0 2.0 14.0 14.0 2.0 2.0 2 14
2 1125.0 21.0 1125.0 1125.0 21.0 21.0 21 1125
3 730.0 30.0 730.0 730.0 30.0 30.0 30 730
4 1125.0 30.0 1125.0 1125.0 30.0 30.0 30 730
5 14.0 2.0 14.0 14.0 2.0 2.0 2 14
6 1125.0 30.0 1125.0 1125.0 30.0 30.0 30 700
7 180.0 30.0 180.0 180.0 30.0 30.0 30 180
8 1125.0 3.0 1125.0 1125.0 3.0 3.0 3 1125
9 45.0 7.0 45.0 45.0 7.0 7.0 7 45
In [ ]:
df[['bathrooms','bathrooms_text']].head(10)
Out[ ]:
bathrooms bathrooms_text
0 NaN 1 bath
1 NaN 1 shared bath
2 NaN 1.5 baths
3 NaN NaN
4 NaN 1 bath
5 NaN 1 bath
6 NaN 1 shared bath
7 NaN 1 shared bath
8 NaN 1 bath
9 NaN 1 shared bath
In [ ]:
df[['host_total_listings_count','host_listings_count']].head(10)
Out[ ]:
host_total_listings_count host_listings_count
0 9.0 6.0
1 5.0 1.0
2 5.0 1.0
3 2.0 2.0
4 1.0 1.0
5 1.0 1.0
6 4.0 1.0
7 2.0 2.0
8 2.0 2.0
9 2.0 2.0

Lets start with dropping columns.¶

All the columns with URL host_picture_url,host_has_profile_pic,host_url,picture_url,host_thumbnail_url,listing_url because we wont need it.

bathrooms and bathrooms_text is one and the same and lets use only one.

host_total_listings_count and host_listings_count is almost unique, so lets drop one.

Few of the unnecessary columns can also be dropped such as : scrape_id, last_scraped,host_verifications, host_identity_verified.

maximum_nights_avg_ntm,minimum_nights_avg_ntm,maximum_maximum_nights,minimum_maximum_nights,maximum_minimum_nights,minimum_minimum_nights,minimum_nights,maximum_nights.

It doesnt makes sense to keep all of this. Hence we will be keeping minimum_nights and maximum_nights only.

In [ ]:
drop_cols = ['host_picture_url','host_has_profile_pic','host_url','picture_url','host_thumbnail_url','listing_url','bathrooms','host_total_listings_count',
             'scrape_id','last_scraped','host_verifications','host_identity_verified','maximum_nights_avg_ntm','minimum_nights_avg_ntm',
             'maximum_maximum_nights','minimum_maximum_nights','maximum_minimum_nights','minimum_minimum_nights']
df_airbnb = df.drop(drop_cols, axis=1)
df_airbnb.shape
Out[ ]:
(41533, 57)
In [ ]:
df_airbnb.shape
Out[ ]:
(41533, 57)

Additional columns that need to be dropped.¶

In [ ]:
df_airbnb
Out[ ]:
id source name description neighborhood_overview host_id host_name host_since host_location host_about ... review_scores_communication review_scores_location review_scores_value license instant_bookable calculated_host_listings_count calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month
0 2595 city scrape Skylit Midtown Castle Beautiful, spacious skylit studio in the heart... Centrally located in the heart of Manhattan ju... 2845 Jennifer 2008-09-09 New York, NY A New Yorker since (Phone number hidden by Air... ... 4.80 4.81 4.40 NaN f 3 3 0 0 0.31
1 5203 previous scrape Cozy Clean Guest Room - Family Apt Our best guests are seeking a safe, clean, spa... Our neighborhood is full of restaurants and ca... 7490 MaryEllen 2009-02-05 New York, NY Welcome to family life with my oldest two away... ... 4.95 4.94 4.92 NaN f 1 0 1 0 0.73
2 5136 city scrape Spacious Brooklyn Duplex, Patio + Garden We welcome you to stay in our lovely 2 br dupl... NaN 7378 Rebecca 2009-02-03 New York, NY Rebecca is an artist/designer, and Henoch is i... ... 5.00 4.67 5.00 NaN f 1 1 0 0 0.03
3 5121 city scrape BlissArtsSpace! One room available for rent in a 2 bedroom apt... NaN 7356 Garon 2009-02-03 New York, NY I am an artist(painter, filmmaker) and curato... ... 4.91 4.47 4.52 NaN f 2 0 2 0 0.30
4 6848 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN 15991 Allen & Irina 2009-05-06 New York, NY We love to travel. When we travel we like to s... ... 4.80 4.67 4.56 NaN f 1 1 0 0 1.13
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41528 772683159414917117 city scrape Dahiari Desconecta de tus preocupaciones en este espac... NaN 125534010 Larissa 2017-04-12 Dominican Republic NaN ... NaN NaN NaN NaN f 1 1 0 0 NaN
41529 772705452516314073 city scrape Beautiful Basement Your family will be close to everything when y... NaN 338424773 Md 2020-02-24 NaN NaN ... NaN NaN NaN NaN t 1 0 1 0 NaN
41530 772710779275911753 city scrape Central Park Close By - 24 This is a Three-Bedroom Apartment. You will ha... NaN 2653479 Richard 2012-06-16 New York, NY I love to travel and meet people. ... NaN NaN NaN NaN t 37 2 35 0 NaN
41531 772714221060214808 city scrape Good Vibes at The Bronx Keep it simple at this peaceful and centrally-... NaN 421264574 Aridio 2021-09-02 NaN NaN ... NaN NaN NaN NaN t 1 1 0 0 NaN
41532 772716724205003579 city scrape 2 bedroom Condo near West Village This beautifully decorated condo will give you... NaN 481177884 Steven 2022-09-26 NaN NaN ... NaN NaN NaN NaN t 2 2 0 0 NaN

41533 rows × 57 columns

In [ ]:
df_airbnb.columns
Out[ ]:
Index(['id', 'source', 'name', 'description', 'neighborhood_overview',
       'host_id', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_neighbourhood', 'host_listings_count',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'latitude', 'longitude',
       'property_type', 'room_type', 'accommodates', 'bathrooms_text',
       'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights',
       'maximum_nights', 'calendar_updated', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'calendar_last_scraped', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'license', 'instant_bookable',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'reviews_per_month'],
      dtype='object')

Let's check again the missing value percentage count for each column.¶

In [ ]:
missing_values = df_airbnb.isnull().mean()*100
missing_values = missing_values.sort_values(ascending = False)
print(missing_values.to_markdown())
|                                              |           0 |
|:---------------------------------------------|------------:|
| calendar_updated                             | 100         |
| license                                      |  99.9976    |
| host_about                                   |  44.0902    |
| neighborhood_overview                        |  42.0003    |
| neighbourhood                                |  41.9979    |
| host_response_time                           |  32.8534    |
| host_response_rate                           |  32.8534    |
| host_acceptance_rate                         |  29.4007    |
| review_scores_location                       |  23.7113    |
| review_scores_value                          |  23.7113    |
| review_scores_checkin                        |  23.704     |
| review_scores_accuracy                       |  23.6944    |
| review_scores_communication                  |  23.6824    |
| review_scores_cleanliness                    |  23.6703    |
| first_review                                 |  22.6158    |
| review_scores_rating                         |  22.6158    |
| last_review                                  |  22.6158    |
| reviews_per_month                            |  22.6158    |
| host_neighbourhood                           |  19.7169    |
| host_location                                |  18.6478    |
| bedrooms                                     |   9.20232   |
| beds                                         |   2.26567   |
| description                                  |   1.89247   |
| bathrooms_text                               |   0.185395  |
| host_is_superhost                            |   0.069824  |
| name                                         |   0.0313004 |
| host_since                                   |   0.0120386 |
| host_name                                    |   0.0120386 |
| host_listings_count                          |   0.0120386 |
| number_of_reviews_l30d                       |   0         |
| calculated_host_listings_count_entire_homes  |   0         |
| instant_bookable                             |   0         |
| calculated_host_listings_count               |   0         |
| number_of_reviews                            |   0         |
| calculated_host_listings_count_private_rooms |   0         |
| calculated_host_listings_count_shared_rooms  |   0         |
| number_of_reviews_ltm                        |   0         |
| id                                           |   0         |
| calendar_last_scraped                        |   0         |
| accommodates                                 |   0         |
| host_id                                      |   0         |
| neighbourhood_cleansed                       |   0         |
| neighbourhood_group_cleansed                 |   0         |
| latitude                                     |   0         |
| longitude                                    |   0         |
| property_type                                |   0         |
| room_type                                    |   0         |
| amenities                                    |   0         |
| availability_365                             |   0         |
| source                                       |   0         |
| minimum_nights                               |   0         |
| maximum_nights                               |   0         |
| has_availability                             |   0         |
| availability_30                              |   0         |
| availability_60                              |   0         |
| availability_90                              |   0         |
| price                                        |   0         |

Dropping columns calendar_updated and license since these columns consists of almost null values only.¶

In [ ]:
drop_cols = ['calendar_updated', 'license']
df_airbnb = df_airbnb.drop(drop_cols, axis=1)
df_airbnb.shape 
Out[ ]:
(41533, 55)
In [ ]:
missing_values = df_airbnb.isnull().mean()*100
missing_values = missing_values.sort_values(ascending = False)
print(missing_values.to_markdown())
|                                              |          0 |
|:---------------------------------------------|-----------:|
| host_about                                   | 44.0902    |
| neighborhood_overview                        | 42.0003    |
| neighbourhood                                | 41.9979    |
| host_response_time                           | 32.8534    |
| host_response_rate                           | 32.8534    |
| host_acceptance_rate                         | 29.4007    |
| review_scores_location                       | 23.7113    |
| review_scores_value                          | 23.7113    |
| review_scores_checkin                        | 23.704     |
| review_scores_accuracy                       | 23.6944    |
| review_scores_communication                  | 23.6824    |
| review_scores_cleanliness                    | 23.6703    |
| review_scores_rating                         | 22.6158    |
| last_review                                  | 22.6158    |
| first_review                                 | 22.6158    |
| reviews_per_month                            | 22.6158    |
| host_neighbourhood                           | 19.7169    |
| host_location                                | 18.6478    |
| bedrooms                                     |  9.20232   |
| beds                                         |  2.26567   |
| description                                  |  1.89247   |
| bathrooms_text                               |  0.185395  |
| host_is_superhost                            |  0.069824  |
| name                                         |  0.0313004 |
| host_listings_count                          |  0.0120386 |
| host_since                                   |  0.0120386 |
| host_name                                    |  0.0120386 |
| number_of_reviews_l30d                       |  0         |
| number_of_reviews_ltm                        |  0         |
| calculated_host_listings_count_entire_homes  |  0         |
| instant_bookable                             |  0         |
| calculated_host_listings_count               |  0         |
| calendar_last_scraped                        |  0         |
| calculated_host_listings_count_private_rooms |  0         |
| calculated_host_listings_count_shared_rooms  |  0         |
| number_of_reviews                            |  0         |
| id                                           |  0         |
| availability_365                             |  0         |
| availability_90                              |  0         |
| host_id                                      |  0         |
| neighbourhood_cleansed                       |  0         |
| neighbourhood_group_cleansed                 |  0         |
| latitude                                     |  0         |
| longitude                                    |  0         |
| property_type                                |  0         |
| room_type                                    |  0         |
| accommodates                                 |  0         |
| source                                       |  0         |
| price                                        |  0         |
| minimum_nights                               |  0         |
| maximum_nights                               |  0         |
| has_availability                             |  0         |
| availability_30                              |  0         |
| availability_60                              |  0         |
| amenities                                    |  0         |
In [ ]:
df_airbnb.shape
Out[ ]:
(41533, 55)

Let's look into the neighborhood column and see what might help us in predicting the price¶

In [ ]:
df_airbnb[['host_location', 'host_neighbourhood', 'neighbourhood', 'neighbourhood_cleansed',  
    'neighbourhood_group_cleansed']].head(10)
Out[ ]:
host_location host_neighbourhood neighbourhood neighbourhood_cleansed neighbourhood_group_cleansed
0 New York, NY Midtown New York, United States Midtown Manhattan
1 New York, NY Upper West Side New York, United States Upper West Side Manhattan
2 New York, NY Greenwood Heights NaN Sunset Park Brooklyn
3 New York, NY Bedford-Stuyvesant NaN Bedford-Stuyvesant Brooklyn
4 New York, NY Williamsburg NaN Williamsburg Brooklyn
5 New York, NY Hell's Kitchen New York, United States Midtown Manhattan
6 New York, NY East Harlem New York, United States East Harlem Manhattan
7 New York, NY East Harlem New York, United States East Harlem Manhattan
8 New York, NY Fort Greene Brooklyn, New York, United States Fort Greene Brooklyn
9 New York, NY Williamsburg Brooklyn, New York, United States Williamsburg Brooklyn

Since 'host_neighbourhood' and 'neighbourhood_cleansed' seem to be giving the same information and neighbourhood_cleansed has no NaN values we will keep that and drop host_neighbourhood.¶

In [ ]:
drop_cols = ['host_neighbourhood']
df_airbnb = df_airbnb.drop(drop_cols, axis=1)
df_airbnb.shape
Out[ ]:
(41533, 54)
In [ ]:
df_airbnb[['calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count']]
Out[ ]:
calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms calculated_host_listings_count_entire_homes calculated_host_listings_count
0 0 0 3 3
1 1 0 0 1
2 0 0 1 1
3 2 0 0 2
4 0 0 1 1
... ... ... ... ...
41528 0 0 1 1
41529 1 0 0 1
41530 35 0 2 37
41531 0 0 1 1
41532 0 0 2 2

41533 rows × 4 columns

In [ ]:
df_airbnb.calculated_host_listings_count_shared_rooms.value_counts()
Out[ ]:
0     40706
1       464
2       112
3       111
15       32
8        24
5        24
4        24
10       16
9        13
7         7
Name: calculated_host_listings_count_shared_rooms, dtype: int64

calculated_host_listings_count_shared_rooms seems to have a lot zeroes which in case means the host never listed a shared room.

We will keep that for now it might help

host_response_rate, host_acceptance_rate has % sign that needs to be removed.¶

In [ ]:
df_airbnb[['host_response_rate', 'host_acceptance_rate']]
Out[ ]:
host_response_rate host_acceptance_rate
0 72% 22%
1 NaN NaN
2 NaN 50%
3 90% 82%
4 100% 100%
... ... ...
41528 100% 100%
41529 100% 100%
41530 98% 86%
41531 NaN NaN
41532 83% 100%

41533 rows × 2 columns

price has "$" and "," that needs to be removed.¶

In [ ]:
df_airbnb[['price']]
Out[ ]:
price
0 $175.00
1 $75.00
2 $275.00
3 $60.00
4 $68.00
... ...
41528 $105.00
41529 $87.00
41530 $70.00
41531 $125.00
41532 $1,114.00

41533 rows × 1 columns

Datetime columns that we will change to weeks for feature engineering in the next step.¶

'first_review', 'last_review', 'host_since'

In [ ]:
df_airbnb['first_review'] = pd.to_datetime(df_airbnb['first_review'])
In [ ]:
df_airbnb['last_review'] = pd.to_datetime(df_airbnb['last_review'])
In [ ]:
df_airbnb['host_since'] = pd.to_datetime(df_airbnb['host_since'])
In [ ]:
df_airbnb[['first_review', 'last_review', 'host_since']]
Out[ ]:
first_review last_review host_since
0 2009-11-21 2022-06-21 2008-09-09
1 2009-09-07 2017-07-21 2009-02-05
2 2014-01-02 2022-08-10 2009-02-03
3 2009-05-28 2019-12-02 2009-02-03
4 2009-05-25 2022-11-02 2009-05-06
... ... ... ...
41528 NaT NaT 2017-04-12
41529 NaT NaT 2020-02-24
41530 NaT NaT 2012-06-16
41531 NaT NaT 2021-09-02
41532 NaT NaT 2022-09-26

41533 rows × 3 columns

In [ ]:
def remove_signs(X):
    # list of columns that we need to run strip function on 
    cols1 = ['host_response_rate', 'host_acceptance_rate'] 
    cols2 = ['price']
    # iterating over all the columns in the list 
    for col in cols1:
        X[col] = X[col].str.strip('%')
    for col in cols2:
        X[col] = X[col].str.strip('$')
    for col in cols2:
        X[col] = X[col].str.replace(',','')
    
    # list of datetime columns that we will change to weeks for feature engineering in the next step
    cols3 = ['first_review', 'last_review', 'host_since']
     # iterating over all the columns in the list
    for col in cols3:
        X[col] = X[col].dt.week
    
    # change dtype of 'price'z to float
    for col in cols2:
        X[col] = X[col].astype(float)
        
    return X

# passing our dataframe as the argument 
df_airbnb = remove_signs(df_airbnb)
<ipython-input-30-8139a14801d6>:17: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
  X[col] = X[col].dt.week
<ipython-input-30-8139a14801d6>:17: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
  X[col] = X[col].dt.week
<ipython-input-30-8139a14801d6>:17: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
  X[col] = X[col].dt.week
In [ ]:
df_airbnb
Out[ ]:
id source name description neighborhood_overview host_id host_name host_since host_location host_about ... review_scores_checkin review_scores_communication review_scores_location review_scores_value instant_bookable calculated_host_listings_count calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month
0 2595 city scrape Skylit Midtown Castle Beautiful, spacious skylit studio in the heart... Centrally located in the heart of Manhattan ju... 2845 Jennifer 37.0 New York, NY A New Yorker since (Phone number hidden by Air... ... 4.77 4.80 4.81 4.40 f 3 3 0 0 0.31
1 5203 previous scrape Cozy Clean Guest Room - Family Apt Our best guests are seeking a safe, clean, spa... Our neighborhood is full of restaurants and ca... 7490 MaryEllen 6.0 New York, NY Welcome to family life with my oldest two away... ... 4.97 4.95 4.94 4.92 f 1 0 1 0 0.73
2 5136 city scrape Spacious Brooklyn Duplex, Patio + Garden We welcome you to stay in our lovely 2 br dupl... NaN 7378 Rebecca 6.0 New York, NY Rebecca is an artist/designer, and Henoch is i... ... 5.00 5.00 4.67 5.00 f 1 1 0 0 0.03
3 5121 city scrape BlissArtsSpace! One room available for rent in a 2 bedroom apt... NaN 7356 Garon 6.0 New York, NY I am an artist(painter, filmmaker) and curato... ... 4.91 4.91 4.47 4.52 f 2 0 2 0 0.30
4 6848 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN 15991 Allen & Irina 19.0 New York, NY We love to travel. When we travel we like to s... ... 4.84 4.80 4.67 4.56 f 1 1 0 0 1.13
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41528 772683159414917117 city scrape Dahiari Desconecta de tus preocupaciones en este espac... NaN 125534010 Larissa 15.0 Dominican Republic NaN ... NaN NaN NaN NaN f 1 1 0 0 NaN
41529 772705452516314073 city scrape Beautiful Basement Your family will be close to everything when y... NaN 338424773 Md 9.0 NaN NaN ... NaN NaN NaN NaN t 1 0 1 0 NaN
41530 772710779275911753 city scrape Central Park Close By - 24 This is a Three-Bedroom Apartment. You will ha... NaN 2653479 Richard 24.0 New York, NY I love to travel and meet people. ... NaN NaN NaN NaN t 37 2 35 0 NaN
41531 772714221060214808 city scrape Good Vibes at The Bronx Keep it simple at this peaceful and centrally-... NaN 421264574 Aridio 35.0 NaN NaN ... NaN NaN NaN NaN t 1 1 0 0 NaN
41532 772716724205003579 city scrape 2 bedroom Condo near West Village This beautifully decorated condo will give you... NaN 481177884 Steven 39.0 NaN NaN ... NaN NaN NaN NaN t 2 2 0 0 NaN

41533 rows × 54 columns

In [ ]:
df_airbnb[['host_response_rate', 'host_acceptance_rate']]
Out[ ]:
host_response_rate host_acceptance_rate
0 72 22
1 NaN NaN
2 NaN 50
3 90 82
4 100 100
... ... ...
41528 100 100
41529 100 100
41530 98 86
41531 NaN NaN
41532 83 100

41533 rows × 2 columns

In [ ]:
df_airbnb[['price']]
Out[ ]:
price
0 175.0
1 75.0
2 275.0
3 60.0
4 68.0
... ...
41528 105.0
41529 87.0
41530 70.0
41531 125.0
41532 1114.0

41533 rows × 1 columns

In [ ]:
# Checking null values count columnwise
total_nan = df_airbnb.isna().sum().sort_values(ascending=False)
percentage_nan = (total_nan / df_airbnb.shape[0]) * 100
tabel = pd.concat([total_nan, percentage_nan], axis=1, keys=['Total NaN values', 'Percentage of NaN values'])
tabel
Out[ ]:
Total NaN values Percentage of NaN values
host_about 18312 44.090241
neighborhood_overview 17444 42.000337
neighbourhood 17443 41.997929
host_response_time 13645 32.853394
host_response_rate 13645 32.853394
host_acceptance_rate 12211 29.400718
review_scores_location 9848 23.711266
review_scores_value 9848 23.711266
review_scores_checkin 9845 23.704043
review_scores_accuracy 9841 23.694412
review_scores_communication 9836 23.682373
review_scores_cleanliness 9831 23.670334
review_scores_rating 9393 22.615751
last_review 9393 22.615751
first_review 9393 22.615751
reviews_per_month 9393 22.615751
host_location 7745 18.647822
bedrooms 3822 9.202321
beds 941 2.265668
description 786 1.892471
bathrooms_text 77 0.185395
host_is_superhost 29 0.069824
name 13 0.031300
host_since 5 0.012039
host_name 5 0.012039
host_listings_count 5 0.012039
number_of_reviews_l30d 0 0.000000
number_of_reviews_ltm 0 0.000000
calculated_host_listings_count_entire_homes 0 0.000000
instant_bookable 0 0.000000
calculated_host_listings_count 0 0.000000
calendar_last_scraped 0 0.000000
calculated_host_listings_count_private_rooms 0 0.000000
calculated_host_listings_count_shared_rooms 0 0.000000
number_of_reviews 0 0.000000
id 0 0.000000
availability_365 0 0.000000
availability_90 0 0.000000
host_id 0 0.000000
neighbourhood_cleansed 0 0.000000
neighbourhood_group_cleansed 0 0.000000
latitude 0 0.000000
longitude 0 0.000000
property_type 0 0.000000
room_type 0 0.000000
accommodates 0 0.000000
amenities 0 0.000000
source 0 0.000000
minimum_nights 0 0.000000
maximum_nights 0 0.000000
has_availability 0 0.000000
availability_30 0 0.000000
availability_60 0 0.000000
price 0 0.000000

Dealing with High Cardinality attribute : property_type¶

In [ ]:
df_airbnb['property_type'].value_counts()
Out[ ]:
Entire rental unit                    17579
Private room in rental unit           10995
Private room in home                   2198
Entire condo                           1690
Entire home                            1568
                                      ...  
Private room in religious building        1
Private room in tent                      1
Private room in dorm                      1
Private room in farm stay                 1
Shared room in shepherd's hut             1
Name: property_type, Length: 80, dtype: int64
In [ ]:
# For property_type, let's consider top 15 types and put rest under Others.

# List of top 15 values
top15 = df_airbnb['property_type'].value_counts()[:15].index

# if not in top 15 put it under 'Other'
df_airbnb.loc[~df_airbnb['property_type'].isin(top15), 'property_type'] = 'Other'
In [ ]:
df_airbnb['property_type'].value_counts()
Out[ ]:
Entire rental unit                    17579
Private room in rental unit           10995
Private room in home                   2198
Entire condo                           1690
Entire home                            1568
Other                                  1400
Private room in townhouse              1098
Room in hotel                           936
Entire loft                             760
Entire townhouse                        684
Private room in condo                   610
Entire serviced apartment               462
Room in boutique hotel                  438
Entire guest suite                      383
Shared room in rental unit              382
Private room in serviced apartment      350
Name: property_type, dtype: int64
In [ ]:
df_airbnb['neighbourhood'].value_counts()
Out[ ]:
New York, United States                        9649
Brooklyn, New York, United States              8997
Queens, New York, United States                3490
Bronx, New York, United States                  491
The Bronx, New York, United States              392
                                               ... 
ozone park queens , New York, United States       1
Jamaica , ny, United States                       1
New York, New York , United States                1
 Crown Heights,NY, New York, United States        1
Valley Stream, New York, United States            1
Name: neighbourhood, Length: 193, dtype: int64
In [ ]:
df_airbnb['neighbourhood_cleansed'].value_counts()
Out[ ]:
Bedford-Stuyvesant    2936
Williamsburg          2570
Harlem                1949
Midtown               1918
Bushwick              1752
                      ... 
Woodrow                  1
Bull's Head              1
Westerleigh              1
New Dorp                 1
Hollis Hills             1
Name: neighbourhood_cleansed, Length: 223, dtype: int64

Outliers¶

Finding skewness of the following attributes.

Data Visualization

In [ ]:
from pandas.plotting import scatter_matrix
In [ ]:
# Trying to figure out different columns that seems to affect our target variable 'Price'
cols1 = ['price','bathrooms_text', 'bedrooms', 
        'beds', 'accommodates', 'reviews_per_month']

plt.figure(dpi=500, facecolor = '#dadada')

scatter_matrix(df_airbnb[cols1], alpha=0.4, figsize=(21,17))

plt.savefig(r"figure_1.png")
plt.show()
<Figure size 3000x2000 with 0 Axes>
In [ ]:
cols = ['price','longitude', 'latitude', 'bedrooms', 
        'reviews_per_month', 'neighbourhood']


scatter_matrix(df_airbnb[cols], alpha=0.4, figsize=(21,17))
plt.savefig(r"figure_2.png")
plt.show()
In [ ]:
cols = ['longitude', 'latitude', 'price',  'bedrooms', 'beds', 'accommodates']
def finding_skewness():
    for col in cols:
        print(f'{col} has a skewness of {df_airbnb[col].skew(skipna = True)}')

finding_skewness()
longitude has a skewness of 1.1936961560151862
latitude has a skewness of 0.21193776627988625
price has a skewness of 78.22503185691421
bedrooms has a skewness of 2.6434155698521207
beds has a skewness of 3.971568711806406
accommodates has a skewness of 2.589019820699697
In [ ]:
# to set the facecolor
plt.figure(dpi=250, facecolor = '#dadada')

sns.boxplot('price', data=df_airbnb, palette='Blues')
# plt.ylim(0,5000)# Remove the splines 
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
        
plt.savefig(r"box2.png")

plt.show()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-44-e20d2be84cd7> in <module>
      2 plt.figure(dpi=250, facecolor = '#dadada')
      3 
----> 4 sns.boxplot('price', data=df_airbnb, palette='Blues')
      5 # plt.ylim(0,5000)# Remove the splines
      6 plt.gca().spines["top"].set_visible(False)

TypeError: boxplot() got multiple values for argument 'data'
<Figure size 1500x1000 with 0 Axes>
In [ ]:
plt.figure(dpi=250, facecolor = '#dadada')
# by limitting the x axis we are no able to see the box
sns.boxplot('price', data=df_airbnb, palette='Blues')
plt.xlim(0,1000)

# Remove the splines 
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)

plt.savefig(r"box1.png")

plt.show()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-45-339292bdc3a9> in <module>
      1 plt.figure(dpi=250, facecolor = '#dadada')
      2 # by limitting the x axis we are no able to see the box
----> 3 sns.boxplot('price', data=df_airbnb, palette='Blues')
      4 plt.xlim(0,1000)
      5 

TypeError: boxplot() got multiple values for argument 'data'
<Figure size 1500x1000 with 0 Axes>

It is very hard ot believe that airbnb prices can be as high as 10000 hence we will be using only prices that are between 20$-1000$

In [ ]:
# to get rid of the outliers and to bring down the skewness we will only use price below 1000 and above 24

df_airbnb = df_airbnb.loc[(df_airbnb.price < 1000) & (df_airbnb.price > 24)]
In [ ]:
# checking how much did we control the skewness on price 
df_airbnb.price.skew()
Out[ ]:
2.404036150322895

We were able to bring down the skewness of our target variable down to 2.4 from 13.4 and the rest we will take care by applying log later.

In [ ]:
def plotting_to_check_skewness():
    for col in ['price']:
        # to set the facecolor
        plt.figure(dpi=500, facecolor = '#dadada')
        # setting the limit on the x axis to be able to visualize as we have a big outliers
        plt.xlim(0, 700)
        
        sns.distplot(df_airbnb[col], kde=True, bins='auto')
        
        # Remove the splines 
        plt.gca().spines["top"].set_visible(False)
        plt.gca().spines["bottom"].set_visible(False)
        plt.gca().spines["right"].set_visible(False)
        plt.gca().spines["left"].set_visible(False)

        plt.tight_layout() # Makes it better looking specially on laptops

        # to save the fig
        plt.savefig('skew.png',bbox_inches='tight', dpi=500, facecolor = '#dadada')

        plt.show()
        
plotting_to_check_skewness()
<ipython-input-48-452f7c12176b>:8: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df_airbnb[col], kde=True, bins='auto')
In [ ]:
# Resetting the index as we deleted some rows 
df_airbnb.reset_index(drop=True, inplace=True)
In [ ]:
airbnb_corr = df_airbnb.corr()
airbnb_corr.price.sort_values(ascending=False)
Out[ ]:
price                                           1.000000
accommodates                                    0.464287
bedrooms                                        0.388907
beds                                            0.360864
host_listings_count                             0.163592
calculated_host_listings_count_entire_homes     0.141445
availability_90                                 0.139171
availability_60                                 0.138653
availability_30                                 0.132365
availability_365                                0.124052
review_scores_location                          0.115962
id                                              0.105374
review_scores_cleanliness                       0.095617
reviews_per_month                               0.087290
host_id                                         0.084607
last_review                                     0.075268
calculated_host_listings_count                  0.069573
number_of_reviews_ltm                           0.058225
review_scores_rating                            0.056165
latitude                                        0.040055
number_of_reviews_l30d                          0.036037
review_scores_accuracy                          0.016967
host_since                                      0.013988
first_review                                    0.013575
review_scores_communication                     0.009955
review_scores_checkin                           0.003006
maximum_nights                                 -0.001487
review_scores_value                            -0.004297
number_of_reviews                              -0.011327
calculated_host_listings_count_shared_rooms    -0.048729
calculated_host_listings_count_private_rooms   -0.083295
minimum_nights                                 -0.114291
longitude                                      -0.239916
Name: price, dtype: float64
In [ ]:
# set heatmap size
plt.figure(figsize= (22,13), dpi=250) 
# create heatmap using seaborn
cbar_kws = {"shrink":.8,
           'extend':'max',
           'extendfrac':.2, 
           "drawedges":True}

sns.heatmap(airbnb_corr, vmin = -1, vmax = 1, cmap="coolwarm", annot = True, annot_kws={'size': 10}, linewidth = 1, cbar_kws=cbar_kws)

plt.savefig(r'heat.png',bbox_inches='tight', dpi=250, facecolor = '#dadada')

plt.show()

Looking at the heatmap we can eliminate features that highly correlated between each other as they will not add anymore value in the model.

  • one of the calculated_host_listing_count or host_listings_count could be deleted as they are highly correalted between eachother.
  • one of the calculated_host_listings_count_entire_homes or host_listings_count could be deleted as they are highly correalted between eachother.
  • one of the reviews_per_month or number_of_reviews_ltm could be deleted as they are highly correalted between eachother.
  • two of the three availability_30, availability_60, availability_90 could also be deleted.
In [ ]:
drop_cols4 = ['number_of_reviews_ltm', 'availability_60', 'availability_90', 'calculated_host_listings_count_entire_homes']

df_airbnb.drop(drop_cols4, axis=1, inplace=True)
<ipython-input-52-98f1c63ea659>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb.drop(drop_cols4, axis=1, inplace=True)
In [ ]:
plt.figure(dpi=250, facecolor = '#dadada')
# histogram of the bedrooms attribute
plt.hist(df_airbnb['bedrooms'])

plt.xlim(0,8)

# Remove the splines 
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)

plt.title('Bedrooms')
plt.savefig(r"box5.png")

plt.show()
In [ ]:
# Filling the NaN values using front fill method
df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(method='ffill')

# Made a new column with 4 bins each with good amount of instances  
df_airbnb['bedroom'] = pd.cut(df_airbnb['bedrooms'],
                               bins=[0., 1, 2, np.inf],
                               labels=[1, 2, 3])


df_airbnb['bedroom'] = df_airbnb['bedroom'].fillna(method='ffill')
<ipython-input-54-368bcc9ff685>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(method='ffill')
<ipython-input-54-368bcc9ff685>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb['bedroom'] = pd.cut(df_airbnb['bedrooms'],
<ipython-input-54-368bcc9ff685>:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb['bedroom'] = df_airbnb['bedroom'].fillna(method='ffill')
In [ ]:
# histogram plot of our newly created column

plt.figure(dpi=250, facecolor = '#dadada')
# histogram of the bedrooms attribute
plt.hist(df_airbnb['bedroom'])

plt.xlim(0,4)

# Remove the splines 
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)

plt.savefig(r"box7.png")

plt.title('Bedrooms')
plt.show()
In [ ]:
# Filling the NaN values using front fill method
df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(method='ffill')

# Made a new column with 4 bins each with good amount of instances  
df_airbnb['bedroom'] = pd.cut(df_airbnb['bedrooms'],
                               bins=[0., 1, 2, np.inf],
                               labels=[1, 2, 3])


df_airbnb['bedroom'] = df_airbnb['bedroom'].fillna(method='ffill')
<ipython-input-56-368bcc9ff685>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(method='ffill')
<ipython-input-56-368bcc9ff685>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb['bedroom'] = pd.cut(df_airbnb['bedrooms'],
<ipython-input-56-368bcc9ff685>:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb['bedroom'] = df_airbnb['bedroom'].fillna(method='ffill')
In [ ]:
df_airbnb.drop(0,axis=0,inplace=True)
<ipython-input-57-6d7551621bfc>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb.drop(0,axis=0,inplace=True)
In [ ]:
df_airbnb['bedrooms'].isna().sum()
Out[ ]:
0
In [ ]:
# histogram plot of our newly created column

plt.figure(dpi=250, facecolor = '#dadada')
# histogram of the bedrooms attribute
plt.hist(df_airbnb['bedroom'])

plt.xlim(0,4)

# Remove the splines 
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)

plt.savefig(r"box7.png")

plt.title('Bedrooms')
plt.show()
In [ ]:
# Replacing columns with f/t with 0/1
df_airbnb.replace({'f': 0, 't': 1}, inplace=True)

# Plotting the distribution of numerical and boolean categories
df_airbnb.hist(figsize=(20,20));
<ipython-input-60-4ab2edf398b7>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb.replace({'f': 0, 't': 1}, inplace=True)
In [ ]:
import warnings
warnings.filterwarnings("ignore")
In [ ]:
df_airbnb.last_review = pd.to_datetime(df_airbnb.last_review) # Converting to datetime

# Calculating the number of days between the first review and the date the data was scraped
df_airbnb['time_since_first_review'] = (pd.datetime(2019, 4, 9) - df_airbnb.last_review).astype('timedelta64[D]')
In [ ]:
# Distribution of the number of days since first review
df_airbnb.first_review.hist(figsize=(15,5), bins=30);
In [ ]:
def bin_column(col, bins, labels, na_label='unknown'):
    """
    Takes in a column name, bin cut points and labels, replaces the original column with a
    binned version, and replaces nulls (with 'unknown' if unspecified).
    """
    df_airbnb[col] = pd.cut(df[col], bins=bins, labels=labels, include_lowest=True)
    df_airbnb[col] = df_airbnb[col].astype('str')
    df_airbnb[col].fillna(na_label, inplace=True)
In [ ]:
 
In [ ]:
# Checking the distributions of the review ratings columns
variables_to_plot = list(df_airbnb.columns[df_airbnb.columns.str.startswith("review_scores") == True])
fig = plt.figure(figsize=(12,8))
for i, var_name in enumerate(variables_to_plot):
    ax = fig.add_subplot(3,3,i+1)
    df[var_name].hist(bins=10,ax=ax)
    ax.set_title(var_name)
fig.tight_layout()
plt.show()
In [ ]:
df_airbnb['host_since'] = pd.to_datetime(df['host_since'])
df_airbnb['first_review'] = pd.to_datetime(df['first_review'])
In [ ]:
print("Average number of listings per host per year on Airbnb in New York:")
print(round(df_airbnb.set_index('host_since').host_listings_count.resample('YS').mean(),2))
Average number of listings per host per year on Airbnb in New York:
host_since
2008-01-01      1.28
2009-01-01      7.74
2010-01-01     28.84
2011-01-01     26.64
2012-01-01     50.76
2013-01-01     54.81
2014-01-01     46.98
2015-01-01     61.10
2016-01-01     77.26
2017-01-01     88.60
2018-01-01    138.41
2019-01-01     81.05
2020-01-01    152.84
2021-01-01    115.33
2022-01-01    125.43
Name: host_listings_count, dtype: float64
In [ ]:
# List of the largest host_listings_count and the year the host joined Airbnb
df.sort_values('host_listings_count').drop_duplicates('host_listings_count',keep='last').tail(10)[['host_since', 'host_listings_count']]
Out[ ]:
host_since host_listings_count
10678 2015-04-14 564.0
38437 2014-12-23 574.0
25495 2020-10-26 715.0
38587 2022-07-11 767.0
31621 2019-04-26 1459.0
20661 2015-11-02 1519.0
33444 2018-02-22 2250.0
28370 2013-03-25 2648.0
15845 2016-12-16 4559.0
4953 NaN NaN
In [ ]:
df_airbnb.head(10)
Out[ ]:
id source name description neighborhood_overview host_id host_name host_since host_location host_about ... review_scores_communication review_scores_location review_scores_value instant_bookable calculated_host_listings_count calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month bedroom time_since_first_review
1 5203 previous scrape Cozy Clean Guest Room - Family Apt Our best guests are seeking a safe, clean, spa... Our neighborhood is full of restaurants and ca... 7490 MaryEllen 2009-02-05 New York, NY Welcome to family life with my oldest two away... ... 4.95 4.94 4.92 0 1 1 0 0.73 1 17994.0
2 5136 city scrape Spacious Brooklyn Duplex, Patio + Garden We welcome you to stay in our lovely 2 br dupl... NaN 7378 Rebecca 2009-02-03 New York, NY Rebecca is an artist/designer, and Henoch is i... ... 5.00 4.67 5.00 0 1 0 0 0.03 2 17994.0
3 5121 city scrape BlissArtsSpace! One room available for rent in a 2 bedroom apt... NaN 7356 Garon 2009-02-03 New York, NY I am an artist(painter, filmmaker) and curato... ... 4.91 4.47 4.52 0 2 2 0 0.30 1 17994.0
4 6848 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN 15991 Allen & Irina 2009-05-06 New York, NY We love to travel. When we travel we like to s... ... 4.80 4.67 4.56 0 1 0 0 1.13 1 17994.0
5 5178 city scrape Large Furnished Room Near B'way Please don’t expect the luxury here just a bas... Theater district, many restaurants around here. 8967 Shunichi 2009-03-03 New York, NY I used to work for a financial industry but no... ... 4.45 4.88 4.39 0 1 1 0 3.38 1 17994.0
6 6990 city scrape UES Beautiful Blue Room Beautiful peaceful healthy home<br /><br /><b>... Location: Five minutes to Central Park, Museum... 16800 Cyn 2009-05-12 New York, NY Capturing the Steinbeck side of life in its Fi... ... 4.95 4.84 4.85 1 1 1 0 1.52 1 17994.0
7 6872 city scrape Uptown Sanctuary w/ Private Bath (Month to Month) A charming month-to-month home away from home ... This sweet Harlem sanctuary is a 10-20 minute ... 16104 Kae 2009-05-07 New York, NY A former life in fashion and wellness has left... ... 5.00 5.00 5.00 0 2 2 0 0.16 1 17994.0
8 7097 city scrape Perfect for Your Parents: Privacy + Garden Parents/grandparents coming to town, or just h... Residential, village-like atmosphere. Lots of ... 17571 Jane 2009-05-17 New York, NY I have been an Airbnb host since 2009 -- just ... ... 4.92 4.94 4.81 1 2 1 0 2.01 1 17994.0
9 7064 city scrape Amazing location! Wburg. Large, bright & tranquil Large, private loft-like room in a spacious 2-... - One stop from the East Village, Lower East S... 17297 Joelle 2009-05-15 New York, NY I have lived in the same apartment in Brooklyn... ... 5.00 5.00 5.00 0 2 2 0 0.09 1 17994.0
10 8490 city scrape Maison des Sirenes1,bohemian, luminous apartment <b>The space</b><br />I am the lucky owner of ... NaN 25183 Nathalie 2009-07-10 New York, NY I am French and have been living in Ny for 10... ... 4.87 4.65 4.75 0 2 0 0 1.01 1 17994.0

10 rows × 52 columns

In [ ]:
df_missing_2 = df_airbnb.isna().sum()
df_missing_2 = df_missing_2.sort_values(ascending = False)
print(df_missing_2.to_markdown())
|                                              |     0 |
|:---------------------------------------------|------:|
| host_about                                   | 17919 |
| neighbourhood                                | 17050 |
| neighborhood_overview                        | 17050 |
| host_response_time                           | 13429 |
| host_response_rate                           | 13429 |
| host_acceptance_rate                         | 11996 |
| review_scores_location                       |  9308 |
| review_scores_value                          |  9308 |
| review_scores_checkin                        |  9305 |
| review_scores_accuracy                       |  9301 |
| review_scores_communication                  |  9296 |
| review_scores_cleanliness                    |  9291 |
| time_since_first_review                      |  8859 |
| review_scores_rating                         |  8859 |
| last_review                                  |  8859 |
| reviews_per_month                            |  8859 |
| first_review                                 |  8481 |
| host_location                                |  7398 |
| beds                                         |   886 |
| description                                  |   746 |
| bathrooms_text                               |    45 |
| name                                         |    12 |
| host_listings_count                          |     5 |
| host_since                                   |     5 |
| host_name                                    |     5 |
| number_of_reviews_l30d                       |     0 |
| calculated_host_listings_count_shared_rooms  |     0 |
| instant_bookable                             |     0 |
| calculated_host_listings_count               |     0 |
| calculated_host_listings_count_private_rooms |     0 |
| calendar_last_scraped                        |     0 |
| bedroom                                      |     0 |
| number_of_reviews                            |     0 |
| id                                           |     0 |
| availability_365                             |     0 |
| availability_30                              |     0 |
| host_id                                      |     0 |
| host_is_superhost                            |     0 |
| neighbourhood_cleansed                       |     0 |
| neighbourhood_group_cleansed                 |     0 |
| latitude                                     |     0 |
| longitude                                    |     0 |
| property_type                                |     0 |
| room_type                                    |     0 |
| accommodates                                 |     0 |
| bedrooms                                     |     0 |
| source                                       |     0 |
| price                                        |     0 |
| minimum_nights                               |     0 |
| maximum_nights                               |     0 |
| has_availability                             |     0 |
| amenities                                    |     0 |
In [ ]:
df_airbnb
Out[ ]:
id source name description neighborhood_overview host_id host_name host_since host_location host_about ... review_scores_communication review_scores_location review_scores_value instant_bookable calculated_host_listings_count calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month bedroom time_since_first_review
1 5203 previous scrape Cozy Clean Guest Room - Family Apt Our best guests are seeking a safe, clean, spa... Our neighborhood is full of restaurants and ca... 7490 MaryEllen 2009-02-05 New York, NY Welcome to family life with my oldest two away... ... 4.95 4.94 4.92 0 1 1 0 0.73 1 17994.0
2 5136 city scrape Spacious Brooklyn Duplex, Patio + Garden We welcome you to stay in our lovely 2 br dupl... NaN 7378 Rebecca 2009-02-03 New York, NY Rebecca is an artist/designer, and Henoch is i... ... 5.00 4.67 5.00 0 1 0 0 0.03 2 17994.0
3 5121 city scrape BlissArtsSpace! One room available for rent in a 2 bedroom apt... NaN 7356 Garon 2009-02-03 New York, NY I am an artist(painter, filmmaker) and curato... ... 4.91 4.47 4.52 0 2 2 0 0.30 1 17994.0
4 6848 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN 15991 Allen & Irina 2009-05-06 New York, NY We love to travel. When we travel we like to s... ... 4.80 4.67 4.56 0 1 0 0 1.13 1 17994.0
5 5178 city scrape Large Furnished Room Near B'way Please don’t expect the luxury here just a bas... Theater district, many restaurants around here. 8967 Shunichi 2009-03-03 New York, NY I used to work for a financial industry but no... ... 4.45 4.88 4.39 0 1 1 0 3.38 1 17994.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
40572 772624737396721045 city scrape Modern Gem 8 Min to JFK Airport Kick back & relax in this calm, stylish, cozy ... NaN 489891925 Germanie 2017-11-19 NaN NaN ... NaN NaN NaN 1 1 0 0 NaN 1 NaN
40573 772683159414917117 city scrape Dahiari Desconecta de tus preocupaciones en este espac... NaN 125534010 Larissa 2020-08-16 Dominican Republic NaN ... NaN NaN NaN 0 1 0 0 NaN 2 NaN
40574 772705452516314073 city scrape Beautiful Basement Your family will be close to everything when y... NaN 338424773 Md 2017-11-19 NaN NaN ... NaN NaN NaN 1 1 1 0 NaN 1 NaN
40575 772710779275911753 city scrape Central Park Close By - 24 This is a Three-Bedroom Apartment. You will ha... NaN 2653479 Richard 2021-11-15 New York, NY I love to travel and meet people. ... NaN NaN NaN 1 37 35 0 NaN 1 NaN
40576 772714221060214808 city scrape Good Vibes at The Bronx Keep it simple at this peaceful and centrally-... NaN 421264574 Aridio 2022-11-07 NaN NaN ... NaN NaN NaN 1 1 0 0 NaN 2 NaN

40576 rows × 52 columns

Split dataset¶

In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit
In [ ]:
df_airbnb = df_airbnb.reset_index()
In [ ]:
# using Stratified Sampling from Scikit Learn
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(df_airbnb, df_airbnb["bedroom"]):
# our stratified train and test set 
    strat_train_set = df_airbnb.loc[train_index]
    strat_test_set = df_airbnb.loc[test_index]
In [ ]:
# dropping our new column that we just made for the purpose of stratified splitting
for set in (strat_train_set, strat_test_set):
    set.drop(['bedroom'], axis=1, inplace=True)
In [ ]:
# Feature matrix
X_train = strat_train_set.drop('price', axis=1)
# Target Variable
y_train = strat_train_set['price']

Data Modelling¶

For evaluation metrics we will be using Mean Absolute Error as it is not affected by outliers unlike Mean Squared Error.

In [ ]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print('Baseline MAE:', mean_absolute_error(y_train, [y_train.mean()]*len(y_train)))
Baseline MAE: 101.64469801403045

Pipeline¶

We will be using the following the in the initial pipeline:

  • We will be using simple imputer with median for numerical attributes as there are outliers

  • We will be using simple imputer with 'most_frequent' for categorical attributes

  • We will be using One Hot encoder for encoding all categorical attributes but 'host_response_rate', 'host_acceptance_rate'

  • we will be using Ordinal encoding for 'host_response_rate', 'host_acceptance_rate' attributes

  • we will be using standard scaler as linear models and Support Vector Machines work better with scaled values

  • We will be splitting pipelines into numerical and categorical since we will be using different strategy as mentioned above for each

In [ ]:
# lets look at numerical attributes for simple imputer with median 
num_attribs = X_train.select_dtypes(exclude='object')
num_attribs.columns
Out[ ]:
Index(['index', 'id', 'host_id', 'host_since', 'host_is_superhost',
       'host_listings_count', 'latitude', 'longitude', 'accommodates',
       'bedrooms', 'beds', 'minimum_nights', 'maximum_nights',
       'has_availability', 'availability_30', 'availability_365',
       'number_of_reviews', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable',
       'calculated_host_listings_count',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'reviews_per_month',
       'time_since_first_review'],
      dtype='object')
In [ ]:
# lets look at categorical attributes for simple imputer with 'most_frequent'
cat_attribs = X_train.select_dtypes(include='object')
cat_attribs.columns
Out[ ]:
Index(['source', 'name', 'description', 'neighborhood_overview', 'host_name',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
       'property_type', 'room_type', 'bathrooms_text', 'amenities',
       'calendar_last_scraped'],
      dtype='object')
In [ ]:
# making a seperate list for cols that we will one hot encode and the other taht we will ordinal encode
cat_attribs_ohe = cat_attribs.drop(['host_response_rate', 'host_acceptance_rate'], axis=1)

cat_attribs_ordinal = cat_attribs[['host_response_rate', 'host_acceptance_rate']]
cat_attribs_ordinal.columns
Out[ ]:
Index(['host_response_rate', 'host_acceptance_rate'], dtype='object')
In [ ]:
cat_attribs_ohe.columns
Out[ ]:
Index(['source', 'name', 'description', 'neighborhood_overview', 'host_name',
       'host_location', 'host_about', 'host_response_time', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
       'property_type', 'room_type', 'bathrooms_text', 'amenities',
       'calendar_last_scraped'],
      dtype='object')

Making a Custom Transformer for Imputing categorical NaN, which will output the result as a dataframe; which we will later pass it on to encoder in the pipeline. I have having an issue of simple imputer spitting out an array, which when passed to ohe for encoding the cat features was throwing an error, as it was expecting a dataframe.

In [ ]:
pip install category-encoders
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category-encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.2/81.2 KB 3.9 MB/s eta 0:00:00
Requirement already satisfied: pandas>=1.0.5 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (1.4.4)
Requirement already satisfied: patsy>=0.5.1 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (0.5.3)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (1.10.1)
Requirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (0.13.5)
Requirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (1.22.4)
Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (1.2.2)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=1.0.5->category-encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=1.0.5->category-encoders) (2022.7.1)
Requirement already satisfied: six in /usr/local/lib/python3.9/dist-packages (from patsy>=0.5.1->category-encoders) (1.15.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20.0->category-encoders) (1.1.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20.0->category-encoders) (3.1.0)
Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.9/dist-packages (from statsmodels>=0.9.0->category-encoders) (23.0)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.6.0
In [ ]:
# Transformers for the Pipeline
import random
import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin

class ImputerDF(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.imputer = SimpleImputer(strategy='most_frequent')
        self.cols = []
        
    def fit(self, X, y=None):
        self.imputer.fit(X)
        self.cols = list(X.columns)
        return self
    
    def transform(self, X):
        X_t = self.imputer.transform(X)
        return pd.DataFrame(X_t, columns=self.cols)
In [ ]:
# making sure it works fine and outputs a dataframe before we apply it in the pipeline.
idf = ImputerDF()
idf.fit_transform(X_train.iloc[:5])
Out[ ]:
index id source name description neighborhood_overview host_id host_name host_since host_location ... review_scores_checkin review_scores_communication review_scores_location review_scores_value instant_bookable calculated_host_listings_count calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month time_since_first_review
0 8817 14472671 city scrape Unique and Cozy 1 bedroom Apt in quiet Pelham Bay Quiet and modern one bedroom apt. with fully e... You can find peace and quiet in this neighborh... 89188080 Honey 2013-11-18 New York, United States ... 4.97 4.93 4.79 4.93 0 1 0 0 3.09 17994.0
1 24484 48316162 city scrape Sonder Flatiron | Accessible Queen Room Start your New York City experience from the h... Centered by one of New York City's most iconic... 219517861 Sonder (NYC) 2019-08-11 New York, NY ... 4.3 4.7 4.6 4.1 0 76 42 0 0.54 17994.0
2 36783 710074059868704221 city scrape Sunny & Bright Private Room in Flatbush, Brooklyn Welcome to our home!<br />Take a break and unw... Located in a serene and treelined section of V... 478199990 Diana 2021-04-12 New York, NY ... 5.0 4.86 4.86 5.0 1 3 3 0 2.96 17994.0
3 22727 45672204 previous scrape **FULLY FURNISHED 1 BEDROOM APARTMENT NYC** Fully furnished 1 bedroom apartment in the hea... Centered by one of New York City's most iconic... 198716390 Vlad 2019-08-11 New York, NY ... 4.0 5.0 4.33 4.67 0 1 0 0 0.12 17994.0
4 7703 12656785 previous scrape Loft @ Williamsburg Bedford Typical Brooklyn industrial Loft.<br />Apartme... Right in the heart of real Williamsburg! Pictu... 19912320 Charles 2016-04-13 New York, NY ... 4.67 5.0 5.0 4.67 0 2 2 0 0.08 17994.0

5 rows × 51 columns

In [ ]:
# Using median as the strategy for Simple Imputer to predict NaN values considering the ouliers
num_pipeline = make_pipeline(
                            SimpleImputer(strategy='median'),
                            StandardScaler()
)
# Added the Custom Transformer to pass on a dataframe to Ordinal Encoder 
# Ordinal Encoder from cetgorical encoders library doesn't work well with np.array
cat_pipeline = make_pipeline(
                            ImputerDF(),
                            OrdinalEncoder(cols = cat_attribs)
)

Using Column transformer we will be putting two pipelines together

In [ ]:
 
In [ ]:
# generating a list of categorical and numerical columns to pass it in the column transformer
cat_attributes = list(cat_attribs)
num_attributes = list(num_attribs)

pipeline = ColumnTransformer([
                            ('num_pipeline', num_pipeline, num_attributes),
                            ('cat_attribs', cat_pipeline, cat_attributes)
])

# we will fit and transform on X_train
X_train_transformed = pipeline.fit_transform(X_train)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-89-fb8a487a6939> in <module>
      9 
     10 # we will fit and transform on X_train
---> 11 X_train_transformed = pipeline.fit_transform(X_train)

/usr/local/lib/python3.9/dist-packages/sklearn/utils/_set_output.py in wrapped(self, X, *args, **kwargs)
    138     @wraps(f)
    139     def wrapped(self, X, *args, **kwargs):
--> 140         data_to_wrap = f(self, X, *args, **kwargs)
    141         if isinstance(data_to_wrap, tuple):
    142             # only wrap the first output for cross decomposition

/usr/local/lib/python3.9/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    725         self._validate_remainder(X)
    726 
--> 727         result = self._fit_transform(X, y, _fit_transform_one)
    728 
    729         if not result:

/usr/local/lib/python3.9/dist-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted, column_as_strings)
    656         )
    657         try:
--> 658             return Parallel(n_jobs=self.n_jobs)(
    659                 delayed(func)(
    660                     transformer=clone(trans) if not fitted else trans,

/usr/local/lib/python3.9/dist-packages/sklearn/utils/parallel.py in __call__(self, iterable)
     61             for delayed_func, args, kwargs in iterable
     62         )
---> 63         return super().__call__(iterable_with_config)
     64 
     65 

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1046             # remaining jobs.
   1047             self._iterating = False
-> 1048             if self.dispatch_one_batch(iterator):
   1049                 self._iterating = self._original_iterator is not None
   1050 

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    862                 return False
    863             else:
--> 864                 self._dispatch(tasks)
    865                 return True
    866 

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in _dispatch(self, batch)
    780         with self._lock:
    781             job_idx = len(self._jobs)
--> 782             job = self._backend.apply_async(batch, callback=cb)
    783             # A job can complete so quickly than its callback is
    784             # called before we get here, causing self._jobs to

/usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

/usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in __call__(self)
    261         # change the default number of processes to -1
    262         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 263             return [func(*args, **kwargs)
    264                     for func, args, kwargs in self.items]
    265 

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in <listcomp>(.0)
    261         # change the default number of processes to -1
    262         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 263             return [func(*args, **kwargs)
    264                     for func, args, kwargs in self.items]
    265 

/usr/local/lib/python3.9/dist-packages/sklearn/utils/parallel.py in __call__(self, *args, **kwargs)
    121             config = {}
    122         with config_context(**config):
--> 123             return self.function(*args, **kwargs)

/usr/local/lib/python3.9/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891     with _print_elapsed_time(message_clsname, message):
    892         if hasattr(transformer, "fit_transform"):
--> 893             res = transformer.fit_transform(X, y, **fit_params)
    894         else:
    895             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.9/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    435         """
    436         fit_params_steps = self._check_fit_params(**fit_params)
--> 437         Xt = self._fit(X, y, **fit_params_steps)
    438 
    439         last_step = self._final_estimator

/usr/local/lib/python3.9/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    357                 cloned_transformer = clone(transformer)
    358             # Fit or load from cache the current transformer
--> 359             X, fitted_transformer = fit_transform_one_cached(
    360                 cloned_transformer,
    361                 X,

/usr/local/lib/python3.9/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    347 
    348     def __call__(self, *args, **kwargs):
--> 349         return self.func(*args, **kwargs)
    350 
    351     def call_and_shelve(self, *args, **kwargs):

/usr/local/lib/python3.9/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891     with _print_elapsed_time(message_clsname, message):
    892         if hasattr(transformer, "fit_transform"):
--> 893             res = transformer.fit_transform(X, y, **fit_params)
    894         else:
    895             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.9/dist-packages/sklearn/utils/_set_output.py in wrapped(self, X, *args, **kwargs)
    138     @wraps(f)
    139     def wrapped(self, X, *args, **kwargs):
--> 140         data_to_wrap = f(self, X, *args, **kwargs)
    141         if isinstance(data_to_wrap, tuple):
    142             # only wrap the first output for cross decomposition

/usr/local/lib/python3.9/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    876         if y is None:
    877             # fit method of arity 1 (unsupervised transformation)
--> 878             return self.fit(X, **fit_params).transform(X)
    879         else:
    880             # fit method of arity 2 (supervised transformation)

/usr/local/lib/python3.9/dist-packages/sklearn/impute/_base.py in fit(self, X, y)
    388             )
    389 
--> 390         X = self._validate_input(X, in_fit=True)
    391 
    392         # default fill_value is 0 for numerical input and "missing_value"

/usr/local/lib/python3.9/dist-packages/sklearn/impute/_base.py in _validate_input(self, X, in_fit)
    325 
    326         try:
--> 327             X = self._validate_data(
    328                 X,
    329                 reset=in_fit,

/usr/local/lib/python3.9/dist-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    563             raise ValueError("Validation should be done on X, y or both.")
    564         elif not no_val_X and no_val_y:
--> 565             X = check_array(X, input_name="X", **check_params)
    566             out = X
    567         elif no_val_X and not no_val_y:

/usr/local/lib/python3.9/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    776         )
    777         if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
--> 778             dtype_orig = np.result_type(*dtypes_orig)
    779 
    780     elif hasattr(array, "iloc") and hasattr(array, "dtype"):

/usr/local/lib/python3.9/dist-packages/numpy/core/overrides.py in result_type(*args, **kwargs)

TypeError: The DType <class 'numpy.dtype[datetime64]'> could not be promoted by <class 'numpy.dtype[float64]'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>)
In [ ]:
df_airbnb.head()
Out[ ]:
index id source name description neighborhood_overview host_id host_name host_since host_location ... review_scores_communication review_scores_location review_scores_value instant_bookable calculated_host_listings_count calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month bedroom time_since_first_review
0 1 5203 previous scrape Cozy Clean Guest Room - Family Apt Our best guests are seeking a safe, clean, spa... Our neighborhood is full of restaurants and ca... 7490 MaryEllen 2009-02-05 New York, NY ... 4.95 4.94 4.92 0 1 1 0 0.73 1 17994.0
1 2 5136 city scrape Spacious Brooklyn Duplex, Patio + Garden We welcome you to stay in our lovely 2 br dupl... NaN 7378 Rebecca 2009-02-03 New York, NY ... 5.00 4.67 5.00 0 1 0 0 0.03 2 17994.0
2 3 5121 city scrape BlissArtsSpace! One room available for rent in a 2 bedroom apt... NaN 7356 Garon 2009-02-03 New York, NY ... 4.91 4.47 4.52 0 2 2 0 0.30 1 17994.0
3 4 6848 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN 15991 Allen & Irina 2009-05-06 New York, NY ... 4.80 4.67 4.56 0 1 0 0 1.13 1 17994.0
4 5 5178 city scrape Large Furnished Room Near B'way Please don’t expect the luxury here just a bas... Theater district, many restaurants around here. 8967 Shunichi 2009-03-03 New York, NY ... 4.45 4.88 4.39 0 1 1 0 3.38 1 17994.0

5 rows × 53 columns

In [ ]:
from google.colab import files
df_airbnb.to_csv('NYC_Listings_filtered.csv') 
files.download('NYC_Listings_filtered.csv')